What this does: Loads OECD-CRS (TXT, one file per year) and GODAD (project, ADM1, ADM2), harmonizes fields, restricts to 2000–2020, and produces diagnostics & visuals: totals over time, coverage heatmap, sector share deltas, identity checks across levels (project/ADM1/ADM2), concentration (Gini), unspecified-region shares, splitting behavior, and an ADM1 map for a case country. Parameters above let you tweak quickly.

Expected files (place them into params$godad_dir):

CRS files should be named like: CRS <year> Data.txt inside params$crs_dir.

0) Setup

# R pkgs
need <- c("reticulate","dplyr","stringr","forcats","ggplot2","scales","tibble","purrr","uwot","stopwords")
to_install <- setdiff(need, rownames(installed.packages()))
if(length(to_install)) install.packages(to_install, quiet=TRUE)
invisible(lapply(need, library, character.only = TRUE))

# R pkgs
n
## function () 
## {
##     peek_mask()$get_current_group_size()
## }
## <bytecode: 0x10bf9a030>
## <environment: namespace:dplyr>
# Helpers
fmt_dollar <- scales::label_dollar(scale_cut = scales::cut_short_scale())
theme_beau <- function(base_size=12){
  ggplot2::theme_minimal(base_size = base_size) +
    theme(plot.title=element_text(face="bold"),
          plot.subtitle=element_text(color="#555"),
          plot.caption=element_text(color="#777", size=9),
          panel.grid.minor=element_blank(),
          legend.position="top")
}
options(stringsAsFactors = FALSE)

# Faster strings
options(stringsAsFactors = FALSE)

fmt_dollar <- scales::label_dollar(scale_cut = scales::cut_short_scale())  # $1k, $1M, ...
fmt_si     <- scales::label_number(scale_cut = scales::cut_si(""))         # 1k, 1M, 1G
fmt_pct    <- scales::label_percent(accuracy = 0.1)

# Access parameters explicitly
data_file   <- params$data_file
amount_kind <- params$amount
use_crs <- params$map_crs

1) Load & prepare (project-level only)

stopifnot(file.exists(data_file))

# Select only columns we'll actually use to save memory
cols_needed <- c(
  "project_id","project_location_id","donor",
  "gid_0","gid_1","gid_2","name_0","name_1","name_2",
  "latitude","longitude","startyear","closingyear","paymentyear",
  "title","description","receiving_agencies","implementing_agencies","funding_agencies",
  "financial_type","flow_class","status","precision_code","location_count",
  "sector_codes","sector_categories","sector_name",
  "comm","comm_nominal","comm_ibrd","comm_ida",
  "disb","disb_nominal","disb_ibrd","disb_ida",
  "comm_loc_evensplit","disb_loc_evensplit","purpose_code","place_name"
)

proj_raw <- vroom::vroom(
  data_file,
  col_select = any_of(cols_needed),
  altrep = TRUE,
  progress = TRUE,
  num_threads = max(1, parallel::detectCores() - 1),
  show_col_types = FALSE
) %>% janitor::clean_names()
## indexing GODAD_projectlevel.csv [-------------------------] 302.36GB/s, eta:  0sindexing GODAD_projectlevel.csv [=================----------] 1.91GB/s, eta:  0sindexing GODAD_projectlevel.csv [==================---------] 1.91GB/s, eta:  0sindexing GODAD_projectlevel.csv [===================--------] 1.90GB/s, eta:  0sindexing GODAD_projectlevel.csv [====================-------] 1.90GB/s, eta:  0sindexing GODAD_projectlevel.csv [====================-------] 1.87GB/s, eta:  0sindexing GODAD_projectlevel.csv [=====================------] 1.88GB/s, eta:  0sindexing GODAD_projectlevel.csv [=======================----] 1.86GB/s, eta:  0sindexing GODAD_projectlevel.csv [=======================----] 1.83GB/s, eta:  0sindexing GODAD_projectlevel.csv [========================---] 1.81GB/s, eta:  0sindexing GODAD_projectlevel.csv [=========================--] 1.78GB/s, eta:  0sindexing GODAD_projectlevel.csv [==========================-] 1.73GB/s, eta:  0sindexing GODAD_projectlevel.csv [===========================] 1.67GB/s, eta:  0sindexing GODAD_projectlevel.csv [===========================] 1.68GB/s, eta:  0s                                                                                
# Basic cleaning & derived fields
proj <- proj_raw %>%
  mutate(
    iso3_rec = toupper(gid_0),                              # GADM country code = ISO3
    year_commit = suppressWarnings(as.integer(startyear)),  # commitment-ish year
    year_disb   = suppressWarnings(as.integer(paymentyear)),
    amount_commit = suppressWarnings(as.numeric(comm)),
    amount_disb   = suppressWarnings(as.numeric(disb)),
    has_coords    = !is.na(latitude) & !is.na(longitude),
    desc_len      = nchar(description %||% "")
  )

# Which year dimension to prefer in global charts
proj <- proj %>%
  mutate(year = if (amount_kind == "disb") year_disb else year_commit)

# Quick snapshot
list(
  n_rows = nrow(proj),
  time_window_commit = range(proj$year_commit, na.rm=TRUE),
  time_window_disb = range(proj$year_disb, na.rm=TRUE),
  pct_with_desc = mean(!is.na(proj$description) & nzchar(proj$description), na.rm=TRUE)
)
## $n_rows
## [1] 903253
## 
## $time_window_commit
## [1] 1900 2023
## 
## $time_window_disb
## [1] 1989 2024
## 
## $pct_with_desc
## [1] 0.9505078

2) Years available & activity over time

# --- Project counts over time ---


ts_counts <- bind_rows(
  proj %>% count(year_commit, name="n") %>% mutate(type="Projects (commit-year)", year=year_commit),
  proj %>% count(year_disb,   name="n") %>% mutate(type="Projects (disb-year)",   year=year_disb)
) %>% filter(!is.na(year)) %>% arrange(year)


yr_min <- min(ts_counts$year, na.rm = TRUE)
yr_max <- max(ts_counts$year, na.rm = TRUE)
n_obs  <- scales::comma(nrow(proj))

sub_txt <- paste0("Project–location counts by year • ", yr_min, "–", yr_max)
cap_txt <- "Lines show counts of project–location records by year, using either commitment or disbursement year. \
Counts are raw and not adjusted for project size or funding."

p_ts_n <- ggplot(ts_counts, aes(year, n, color = type)) +
  geom_line(linewidth = 0.9) +
  scale_color_manual(values = c("Projects (commit-year)" = "#3182bd", "Projects (disb-year)" = "#e6550d")) +
  scale_y_continuous(labels = fmt_si, expand = expansion(mult = c(0.02, 0.08))) +
  scale_x_continuous(expand = expansion(mult = c(0.01, 0.01))) +
  labs(
    title    = "Project counts over time",
    subtitle = sub_txt,
    x = NULL, y = "Number of project–locations",
    color = NULL,
    caption = stringr::str_wrap(cap_txt, width = 95)
  ) +
  theme_beau() +
  theme(
    legend.position = "top",
    panel.grid.minor = element_blank(),
    plot.margin = margin(5, 10, 35, 5)
  )
p_ts_n

# --- Financial volumes over time ---

ts_amounts <- bind_rows(
  proj %>% group_by(year_commit) %>% summarise(total = sum(amount_commit, na.rm=TRUE), .groups="drop") %>%
    mutate(type="Total commitments (USD)", year=year_commit),
  proj %>% group_by(year_disb) %>% summarise(total = sum(amount_disb, na.rm=TRUE), .groups="drop") %>%
    mutate(type="Total disbursements (USD)", year=year_disb)
) %>% filter(!is.na(year))

yr_min2 <- min(ts_amounts$year, na.rm = TRUE)
yr_max2 <- max(ts_amounts$year, na.rm = TRUE)

sub_txt2 <- paste0("Annual totals (USD) • ", yr_min2, "–", yr_max2)
cap_txt2 <- "Lines show the sum of all commitments and disbursements by year (project–locations aggregated). \
Totals are not deflated or normalized."

p_ts_amt <- ggplot(ts_amounts, aes(year, total, color = type)) +
  geom_line(linewidth = 0.9) +
  scale_color_manual(values = c("Total commitments (USD)" = "#3182bd", "Total disbursements (USD)" = "#e6550d")) +
  scale_y_continuous(labels = fmt_dollar, expand = expansion(mult = c(0.02, 0.08))) +
  scale_x_continuous(expand = expansion(mult = c(0.01, 0.01))) +
  labs(
    title    = "Financial volumes over time",
    subtitle = sub_txt2,
    x = NULL, y = "USD",
    color = NULL,
    caption = stringr::str_wrap(cap_txt2, width = 95)
  ) +
  theme_beau() +
  theme(
    legend.position = "top",
    panel.grid.minor = element_blank(),
    plot.margin = margin(5, 10, 35, 5)
  )
p_ts_amt

Coverage heat (counts) by year × donor (top 20 donors)

proj_yeared <- proj %>%
  mutate(year = if (amount_kind == "disb") year_disb else year_commit) %>%
  filter(!is.na(year), !is.na(donor), nzchar(donor))

# Top donors by count (or keep your original selection)
top_donors <- proj_yeared %>%
  count(donor, name = "n") %>%
  slice_max(n, n = 20) %>%
  pull(donor)

yr_min <- min(proj_yeared$year, na.rm = TRUE)
yr_max <- max(proj_yeared$year, na.rm = TRUE)

# Absolute counts (with full year grid)
donor_heat <- proj_yeared %>%
  filter(donor %in% top_donors) %>%
  count(donor, year, name = "n") %>%
  group_by(donor) %>%
  tidyr::complete(year = seq(yr_min, yr_max, 1), fill = list(n = 0)) %>%
  mutate(tot = sum(n)) %>%
  ungroup() %>%
  mutate(donor = forcats::fct_reorder(donor, tot))

# Normalized (per-donor share)
donor_heat_norm <- donor_heat %>%
  group_by(donor) %>%
  mutate(share = n / pmax(tot, 1)) %>%
  ungroup()

# -------- Plot ABSOLUTE COUNTS --------
ggplot(donor_heat, aes(year, donor, fill = n)) +
  geom_tile(color = "white", linewidth = 0.15) +
  scale_fill_viridis_c(labels = fmt_si, name = "Projects", trans = "sqrt") +
  scale_x_continuous(expand = expansion(mult = c(0.002, 0.002))) +
  labs(title = "Activity heatmap — top donors by project count",
       subtitle = paste0("Top 20 donors • ", yr_min, "–", yr_max),
       x = NULL, y = NULL,
       caption = stringr::str_wrap("Tiles show number of project–locations per donor & year; missing years filled with 0.", 95)) +
  theme_beau() +
  theme(panel.grid = element_blank(), legend.position = "right",
        plot.margin = margin(5, 10, 35, 5))

# -------- Plot NORMALIZED SHARES --------
ggplot(donor_heat_norm, aes(year, donor, fill = share)) +
  geom_tile(color = "white", linewidth = 0.15) +
  scale_fill_viridis_c(labels = scales::label_percent(accuracy = 1), name = "Share of donor’s projects") +
  scale_x_continuous(expand = expansion(mult = c(0.002, 0.002))) +
  labs(title = "Activity heatmap — per-donor share by year",
       subtitle = paste0("Each row scaled to that donor’s total • ", yr_min, "–", yr_max),
       x = NULL, y = NULL,
       caption = stringr::str_wrap("Highlights relative hot years within each donor; not comparable across donors.", 95)) +
  theme_beau() +
  theme(panel.grid = element_blank(), legend.position = "right",
        plot.margin = margin(5, 10, 35, 5))

3) Project descriptions (quality & content)

desc_stats <- tibble(
  `Non-missing descriptions` = mean(!is.na(proj$description) & nzchar(proj$description)),
  `Median description length` = median(proj$desc_len[proj$desc_len>0], na.rm=TRUE),
  `P90 description length`    = quantile(proj$desc_len[proj$desc_len>0], 0.9, na.rm=TRUE)
)

DT::datatable(round(desc_stats, 3), caption = "Description coverage & length")

Description length distribution (by donor, top 10 donors)

top10_donors <- proj %>%
  filter(!is.na(donor) & nzchar(donor)) %>%
  count(donor, wt = amount_commit, sort = TRUE) %>%
  slice_head(n = 10) %>%
  pull(donor)

dlen <- proj %>%
  filter(donor %in% top10_donors, desc_len > 0) %>%
  mutate(desc_len = pmin(desc_len, 1200L))        # clamp long tails for readability

# stats for subtitle/caption
sub_txt <- paste0(
  "Top 10 donors by committed amount • ",
  scales::comma(nrow(dlen)), " descriptions • length truncated at 1,200 chars"
)
cap_txt <- "Each panel shows the density of description lengths for a donor. \
Vertical line marks the median length. Extremely long descriptions are truncated at 1,200 characters for readability."

# per-donor medians
meds <- dlen %>%
  group_by(donor) %>%
  summarise(med = median(desc_len, na.rm = TRUE), .groups = "drop")

p_desc <- ggplot(dlen, aes(desc_len, after_stat(density), color = donor)) +
  geom_freqpoly(bins = 60, alpha = 0.85, linewidth = 0.7, show.legend = FALSE) +
  geom_vline(data = meds, aes(xintercept = med, color = donor), linetype = 2, linewidth = 0.6, show.legend = FALSE) +
  coord_cartesian(xlim = c(0, 1200)) +
  facet_wrap(~ donor, ncol = 2, scales = "free_y") +
  labs(
    title    = "Project description length distribution",
    subtitle = sub_txt,
    x = "Characters", y = "Density",
    caption  = stringr::str_wrap(cap_txt, width = 95)
  ) +
  theme_beau() +
  theme(
    panel.grid.minor = element_blank(),
    strip.text = element_text(face = "bold"),
    plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
  )

p_desc

if (!requireNamespace("ggridges", quietly = TRUE)) install.packages("ggridges", quiet = TRUE)
library(ggridges)

p_desc_ridge <- ggplot(dlen, aes(x = desc_len, y = reorder(donor, desc_len, median), fill = donor)) +
  ggridges::geom_density_ridges(scale = 1.1, rel_min_height = 0.01, alpha = 0.85, color = "white", linewidth = 0.2, show.legend = FALSE) +
  geom_vline(data = meds, aes(xintercept = med), linetype = 2, linewidth = 0.6, color = "grey30") +
  coord_cartesian(xlim = c(0, 1200)) +
  labs(
    title    = "Project description length distribution (ridgelines)",
    subtitle = sub_txt,
    x = "Characters", y = NULL,
    caption  = stringr::str_wrap(cap_txt, width = 95)
  ) +
  theme_beau() +
  theme(panel.grid.minor = element_blank())

p_desc_ridge
## Picking joint bandwidth of 20

Top tokens in descriptions (TF-IDF by donor, quick view)

# ---- TF–IDF by donor (clean & robust) ----
if (!"tidytext" %in% rownames(installed.packages())) install.packages("tidytext", quiet = TRUE)
if (!"stopwords" %in% rownames(installed.packages())) install.packages("stopwords", quiet = TRUE)
library(tidytext); library(stopwords)

# 1) multilingual stopwords + domain terms
stop_tbl <- tibble::tibble(
  token = unique(c(
    stopwords("en"), stopwords("fr"), stopwords("es"), stopwords("de"),
    stopwords("it"), stopwords("pt"), stopwords("nl"), stopwords("no"),
    tolower(c("project","programme","program","aid","loan","grant","credit",
              "world","bank","government","support","development","ministry",
              "department","agency","project’s","projects","projet", "financement", "china", "chinese", "indian", "usaid"))
  ))
)

# 2) pick top donors by amount (commit/disburse consistent with params$amount)
metric_vec <- if (amount_kind == "disb") proj$amount_disb else proj$amount_commit
top10_donors <- proj %>%
  mutate(metric = metric_vec) %>%
  filter(!is.na(donor), nzchar(donor)) %>%
  group_by(donor) %>% summarise(total = sum(metric, na.rm = TRUE), .groups="drop") %>%
  slice_max(total, n = 10) %>% pull(donor)

# 3) build corpus and tokenize
corpus <- proj %>%
  filter(donor %in% top10_donors, !is.na(description), nzchar(description)) %>%
  transmute(doc = donor, text = stringr::str_to_lower(description))

tokens <- corpus %>%
  tidytext::unnest_tokens(token, text, token = "words", strip_numeric = TRUE) %>%
  filter(stringr::str_detect(token, "^[a-z]{3,}$")) %>%   # ≥3 letters only
  anti_join(stop_tbl, by = "token")

# 4) TF–IDF (donor = document)
tfidf <- tokens %>%
  count(doc, token, sort = TRUE) %>%
  bind_tf_idf(term = token, document = doc, n = n)

# 5) Filter extremes: drop tokens in only 1 donor or in all donors
docfreq <- tfidf %>%
  group_by(token) %>% summarise(df = n_distinct(doc), .groups="drop")

n_docs <- n_distinct(tfidf$doc)

tfidf_clean <- tfidf %>%
  inner_join(docfreq, by = "token") %>%
  filter(df >= 2, df <= n_docs - 1)


# Stats for subtitle/caption
n_docs   <- dplyr::n_distinct(tfidf_clean$doc)
n_tokens <- dplyr::n_distinct(tfidf_clean$term %||% tfidf_clean$token)
sub_txt  <- paste0("Top TF–IDF tokens by donor • ", n_docs, " donors • ",
                   scales::comma(n_tokens), " unique tokens after filtering")

cap_txt  <- "TF–IDF computed with tidytext::bind_tf_idf(). Tokens are 3+ letters, lowercased; \
multilingual + domain stopwords removed; tokens appearing in only one donor or in all donors were dropped. \
Bars are ordered within facets."

# Ensure we have 'token' column name
if (!"token" %in% names(tfidf_clean)) tfidf_clean <- dplyr::rename(tfidf_clean, token = term)

top_tfidf <- tfidf_clean %>%
  dplyr::group_by(doc) %>%
  dplyr::slice_max(order_by = tf_idf, n = 10, with_ties = FALSE) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(token = tidytext::reorder_within(token, tf_idf, doc))

ggplot(top_tfidf, aes(tf_idf, token)) +
  geom_col(fill = "#3182bd", alpha = 0.9, width = 0.85) +
  tidytext::scale_y_reordered() +
  facet_wrap(~ doc, scales = "free_y", ncol = 2) +
  scale_x_continuous(labels = scales::label_number(accuracy = 0.001)) +
  labs(
    title    = "Top description tokens by TF–IDF (per donor)",
    subtitle = sub_txt,
    x = "TF–IDF", y = NULL,
    caption  = stringr::str_wrap(cap_txt, width = 95)
  ) +
  theme_beau() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    strip.text         = element_text(face = "bold"),
    plot.margin        = margin(t = 5, r = 10, b = 35, l = 5)
  )

4) Amounts — distributions & time profiles

# Long format for quick compare
amt_long <- bind_rows(
  proj %>% select(project_id, donor, iso3_rec, year = year_commit, amount = amount_commit) %>% mutate(kind="Commit"),
  proj %>% select(project_id, donor, iso3_rec, year = year_disb,   amount = amount_disb)   %>% mutate(kind="Disburse")
) %>% filter(!is.na(amount), amount > 0)

# Histogram of amounts (log scale)
sub_hist <- paste0(
  scales::comma(nrow(amt_long)), " project–amount observations • ",
  min(amt_long$year, na.rm=TRUE), "–", max(amt_long$year, na.rm=TRUE)
)

cap_hist <- "Histogram of project-level financial amounts (commitments/disbursements combined). \
X-axis is log-scaled (USD); each bin counts number of project–amount observations. \
Values ≤ 0 are excluded."

p_hist <- ggplot(amt_long, aes(amount)) +
  geom_histogram(bins = 60, fill = "#3182bd", color = "white", linewidth = 0.2, alpha = 0.9) +
  scale_x_log10(labels = fmt_dollar) +
  labs(
    title    = "Amount distribution (log scale)",
    subtitle = sub_hist,
    x = "USD (log10)", y = "Number of projects",
    caption  = stringr::str_wrap(cap_hist, width = 95)
  ) +
  theme_beau() +
  theme(
    panel.grid.minor = element_blank(),
    plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
  )
p_hist

# Violin by donor
top_donors <- amt_long %>%
  count(donor, sort = TRUE) %>%
  slice_head(n = 12) %>%
  pull(donor)

sub_violin <- paste0(
  "Top ", length(top_donors), " donor groups by number of projects • ",
  scales::comma(nrow(filter(amt_long, donor %in% top_donors))), " observations"
)

cap_violin <- "Each violin shows the distribution of project amounts for a donor, split by commitment vs disbursement. \
Widths indicate relative density; scale is log(USD). \
Only top donors by project count are shown."

p_violin <- amt_long %>%
  filter(donor %in% top_donors) %>%
  ggplot(aes(donor, amount, fill = kind)) +
  geom_violin(scale = "width", trim = TRUE, alpha = 0.8, color = "white", linewidth = 0.2) +
  scale_y_log10(labels = fmt_dollar) +
  coord_flip() +
  scale_fill_manual(values = c("Commit" = "#3182bd", "Disburse" = "#e6550d")) +
  labs(
    title    = "Amount distribution by donor (top groups)",
    subtitle = sub_violin,
    x = NULL, y = "USD (log10)", fill = NULL,
    caption  = stringr::str_wrap(cap_violin, width = 95)
  ) +
  theme_beau() +
  theme(
    legend.position = "top",
    panel.grid.minor = element_blank(),
    plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
  )
p_violin

Totals over time (commit vs disburse) — interactive

comm_ts <- proj %>%
  transmute(year = suppressWarnings(as.integer(year_commit)),
            total = suppressWarnings(as.numeric(amount_commit))) %>%
  filter(!is.na(year)) %>%
  group_by(year) %>%
  summarise(total = sum(total, na.rm = TRUE), .groups = "drop") %>%
  mutate(kind = "Commit")

disb_ts <- proj %>%
  transmute(year = suppressWarnings(as.integer(year_disb)),
            total = suppressWarnings(as.numeric(amount_disb))) %>%
  filter(!is.na(year)) %>%
  group_by(year) %>%
  summarise(total = sum(total, na.rm = TRUE), .groups = "drop") %>%
  mutate(kind = "Disburse")

ts_amt_long <- bind_rows(comm_ts, disb_ts) %>%
  arrange(kind, year)

# Guard: if nothing to show, bail early with a message
if (nrow(ts_amt_long) == 0) {
  stop("No annual totals available (both year_commit and year_disb are NA).")
}

# Nice subtitle/caption
yr_min <- min(ts_amt_long$year, na.rm = TRUE)
yr_max <- max(ts_amt_long$year, na.rm = TRUE)
sub_txt <- paste0("Annual totals (USD) • ", yr_min, "–", yr_max)
cap_txt <- "Lines show total commitments/disbursements per year (sum across all project–locations). Hover for values."

# Pre-format hover text
ts_amt_long <- ts_amt_long %>%
  mutate(hover = paste0(
    "<b>", kind, "</b><br>",
    "Year: ", year, "<br>",
    "Total: ", scales::label_dollar(scale_cut = scales::cut_short_scale())(total)
  ))

p <- ggplot(ts_amt_long, aes(x = year, y = total, color = kind, text = hover, group = kind)) +
  geom_line(linewidth = 1, na.rm = TRUE) +
  scale_color_manual(values = c("Commit" = "#3366CC", "Disburse" = "#DC3912")) +
  scale_y_continuous(labels = fmt_dollar, expand = expansion(mult = c(0.02, 0.08))) +
  scale_x_continuous(expand = expansion(mult = c(0.01, 0.01))) +
  labs(
    title = "Financial volumes over time",
    subtitle = sub_txt,
    x = NULL, y = "USD", color = NULL,
    caption = stringr::str_wrap(cap_txt, width = 95)
  ) +
  theme_beau() +
  theme(
    legend.position = "top",
    panel.grid.minor = element_blank(),
    plot.margin = margin(t = 5, r = 10, b = 40, l = 5)
  )

plotly::ggplotly(p, tooltip = "text") %>%
  plotly::layout(legend = list(orientation = "h", x = 0, y = 1.1))

5) Geographic distribution (country & coordinates)

5.1 Choropleth — Projects and volumes by recipient country

metric <- if (amount_kind == "disb") "disburse" else "commit"

by_cty <- proj %>% 
  group_by(iso3_rec) %>% 
  summarise( n_proj = n(), 
             commit = sum(amount_commit, na.rm=TRUE), 
             disburse = sum(amount_disb, na.rm=TRUE), .groups="drop" )

# world map 
world <- tryCatch( rnaturalearth::ne_countries(scale = "medium", returnclass = "sf"), error = function(e) NULL )

# subtitle & caption
yr_min <- suppressWarnings(min(proj$year_commit, proj$year_disb, na.rm = TRUE))
yr_max <- suppressWarnings(max(proj$year_commit, proj$year_disb, na.rm = TRUE))
covered <- sum(!is.na(by_cty[[metric]]) & by_cty[[metric]] > 0)
pct_cov <- covered / n_distinct(by_cty$iso3_rec)

sub_txt <- paste0(
  if (amount_kind == "disb") "Disbursements" else "Commitments",
  " in USD • ", yr_min, "–", yr_max, " • ",
  scales::percent(pct_cov, accuracy = 1), " of recipients with > 0"
)

cap_txt <- "Fill shows total amounts aggregated by recipient ISO3 across all years. Grey = no data or zero total. \
Values are on a log scale to improve contrast across large ranges. Basemap: Natural Earth."

if (!is.null(world)) {
  world <- world %>%
    dplyr::mutate(iso3_rec = dplyr::if_else(iso_a3 == "SDS", "SSD", iso_a3))

  map_df <- dplyr::left_join(world, by_cty, by = "iso3_rec")

  # set zeros to NA for better visual separation (keep outline)
  map_df$fill_val <- map_df[[metric]]
  map_df$fill_val[!is.na(map_df$fill_val) & map_df$fill_val <= 0] <- NA_real_

  p_map <- ggplot(map_df) +
    geom_sf(aes(fill = fill_val), color = "white", linewidth = 0.15) +
    scale_fill_viridis_c(
      name   = "USD (total)",
      labels = fmt_dollar,
      trans  = "log10",
      na.value = "grey92",
      guide = guide_colorbar(
        barheight = unit(10, "lines"),
        ticks = TRUE
      )
    ) +
    labs(
      title    = paste("Recipient country totals —", ifelse(amount_kind=="disb","Disbursements","Commitments")),
      subtitle = sub_txt,
      x = NULL, y = NULL,
      caption  = stringr::str_wrap(cap_txt, width = 95)
    ) +
    theme_beau() +
    theme(
      legend.position = "right",
      panel.grid = element_blank(),
      plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
    )

  if (!is.na(use_crs) && nzchar(use_crs)) {
    p_map <- tryCatch(p_map + coord_sf(crs = use_crs),
                      error = function(e) { message("Projection failed, using WGS84"); p_map })
  }

  p_map
} else {
  message("World basemap unavailable (no rnaturalearth). Skipping choropleth.")
}

5.2 Project locations — global dot map (+ density)

# build points as sf (WGS84)
pts <- proj %>%
  filter(has_coords, between(longitude, -180, 180), between(latitude, -90, 90))

pts_sf <- sf::st_as_sf(pts, coords = c("longitude","latitude"), crs = 4326, remove = FALSE)

# world basemap
base_world <- tryCatch(rnaturalearth::ne_countries(scale = "medium", returnclass = "sf"), error = function(e) NULL)
if (!is.null(base_world)) base_world <- subset(base_world, admin != "Antarctica")


# Diagnostics for subtitle
n_pts  <- nrow(pts_sf)
yr_min <- suppressWarnings(min(proj$year_commit, proj$year_disb, na.rm = TRUE))
yr_max <- suppressWarnings(max(proj$year_commit, proj$year_disb, na.rm = TRUE))
sub_txt <- paste0("All projects with coordinates • ", scales::comma(n_pts), " points • ", yr_min, "–", yr_max)

cap_pts <- "Each dot corresponds to a project–location row with usable latitude/longitude. \
Dots are semi-transparent to reduce overplotting, so darker regions indicate denser clusters. \
Basemap is simplified Natural Earth (no Antarctica)."

p_pts <- ggplot() +
  geom_sf(data = base_world, fill = "grey98", color = "grey85", linewidth = 0.2) +
  geom_sf(data = pts_sf, alpha = 0.25, size = 0.2, color = "steelblue") +
  labs(
    title = "Project point locations",
    subtitle = sub_txt,
    x = NULL, y = NULL,
    caption = stringr::str_wrap(cap_pts, width = 95)
  ) +
  theme_beau() +
  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
  )

# Optional projection
if (!is.na(use_crs) && nzchar(use_crs)) {
  p_pts <- tryCatch(p_pts + coord_sf(crs = use_crs),
                    error = function(e) { message("Projection failed, staying in WGS84"); p_pts })
}

p_pts

Hex version

if (!requireNamespace("maps", quietly = TRUE)) install.packages("maps", quiet = TRUE)
world_df <- ggplot2::map_data("world")


# Helpers for subtitle/caption
n_pts  <- nrow(pts)
yr_min <- suppressWarnings(min(proj$year_commit, proj$year_disb, na.rm = TRUE))
yr_max <- suppressWarnings(max(proj$year_commit, proj$year_disb, na.rm = TRUE))
sub_txt <- paste0("All projects with coordinates • ", n_pts, " points • ", yr_min, "–", yr_max)

cap_count <- "Counts are 2D hexbin tallies of project points in geographic (lon/lat) space. \
Higher counts indicate denser clusters of geocoded project locations. \
Basemap is simplified and drawn in WGS84 using quick mapping."

p_hex <- ggplot() +
  geom_polygon(
    data = world_df,
    aes(long, lat, group = group),
    fill = "grey98", color = "grey85", linewidth = 0.2
  ) +
  stat_bin_2d(
    data = pts,
    aes(x = longitude, y = latitude),
    bins = 130,     # tune as needed (80–160 works well)
    na.rm = TRUE
  ) +
  scale_fill_viridis_c(
    labels = fmt_si,
    name = "Projects",
    trans = "sqrt",          # improves contrast between low/high counts
    guide = guide_colorbar(barheight = unit(10, "lines"))
  ) +
  coord_quickmap() +
  labs(
    title = "Spatial density of project points (hex bins)",
    subtitle = sub_txt,
    x = NULL, y = NULL,
    caption = stringr::str_wrap(cap_count, width = 95)
  ) +
  theme_beau() +
  theme(
    legend.position = "right",
    panel.grid = element_blank(),
    plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
  )
p_hex

metric <- if (amount_kind == "disb") "amount_disb" else "amount_commit"
sub_amt <- paste0(
  if (amount_kind == "disb") "Disbursements" else "Commitments",
  " aggregated per hex • ", yr_min, "–", yr_max
)

cap_amt <- "Fill shows the sum of amounts (USD) within each hexagon; zeros are blank. \
Use with care: large urban clusters can dominate totals. \
Basemap is simplified and drawn in WGS84 using quick mapping."

p_hex_amt <- ggplot() +
  geom_polygon(
    data = world_df,
    aes(long, lat, group = group),
    fill = "grey98", color = "grey85", linewidth = 0.2
  ) +
  stat_summary_2d(
    data = pts,
    aes(x = longitude, y = latitude, z = .data[[metric]]),
    bins = 130,
    fun = function(z) sum(z, na.rm = TRUE),
    na.rm = TRUE
  ) +
  scale_fill_viridis_c(
    labels = fmt_dollar,
    name = "USD (sum)",
    trans = "sqrt",          # friendlier than log(10) for zeros/low cells
    guide = guide_colorbar(barheight = unit(10, "lines"))
  ) +
  coord_quickmap() +
  labs(
    title = "Amount-weighted project density",
    subtitle = sub_amt,
    x = NULL, y = NULL,
    caption = stringr::str_wrap(cap_amt, width = 95)
  ) +
  theme_beau() +
  theme(
    legend.position = "right",
    panel.grid = element_blank(),
    plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
  )
p_hex_amt

5.3 Recipient hot spots (top 25 by total amount)

library(countrycode)

metric <- if (amount_kind == "disb") "amount_disb" else "amount_commit"

top_cty <- proj %>%
  group_by(iso3_rec) %>%
  summarise(val = sum(.data[[metric]], na.rm = TRUE), .groups = "drop") %>%
  mutate(country = countrycode::countrycode(iso3_rec, "iso3c", "country.name"),
         country = ifelse(is.na(country), iso3_rec, country)) %>%
  arrange(desc(val)) %>%
  slice_head(n = 25) %>%
  mutate(
    share   = val / sum(val),
    country = forcats::fct_reorder(country, val)
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `country = countrycode::countrycode(iso3_rec, "iso3c",
##   "country.name")`.
## Caused by warning:
## ! Some values were not matched unambiguously: UNSPECIFIED, XKO, XKX
caption_text <- paste0(
  "Amounts are ", if (amount_kind == "disb") "disbursements" else "commitments in USD (as provided in the dataset), summed across all years and project–location rows. Only the Top-25 recipient countries are shown; the ‘share’ is within this Top-25 subset. Country names derived from ISO3 via {countrycode}."
)


ggplot(top_cty, aes(x = country, y = val)) +
  geom_segment(aes(xend = country, y = 0, yend = val), linewidth = 1.1, color = "grey75") +
  geom_point(aes(fill = val), shape = 21, size = 4, stroke = 0.2) +
  geom_text(
    aes(label = paste0(scales::label_dollar(scale_cut = scales::cut_short_scale())(val),
                       "  •  ", scales::percent(share, accuracy = 0.1))),
    hjust = -0.05, size = 3.2, color = "#333"
  ) +
  coord_flip(clip = "off") +
  scale_fill_viridis_c(guide = "none") +
  scale_y_continuous(labels = fmt_dollar, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title    = "Top 25 recipient countries by total amount",
    subtitle = if (amount_kind == "disb") "Disbursements" else "Commitments",
    x = NULL, y = "USD",
    caption = stringr::str_wrap(caption_text, width = 90)   # wrap caption
  ) +
  theme_beau() +
  theme(
    panel.grid.major.y = element_blank(),
    plot.margin = margin(t = 5, r = 30, b = 40, l = 5)      # more space at bottom
  )

6) Appendix — data hygiene

hyg <- tibble(
  pct_missing_coords = mean(!proj$has_coords),
  pct_missing_commit = mean(is.na(proj$amount_commit)),
  pct_missing_disb   = mean(is.na(proj$amount_disb)),
  pct_missing_year_c = mean(is.na(proj$year_commit)),
  pct_missing_year_d = mean(is.na(proj$year_disb))
)
DT::datatable(round(hyg, 3), caption = "Missingness summary (project-level)")

Notes

  • Years: I use startyear as “commit year” and paymentyear as “disb year”. If your dataset encodes different timing conventions, switch the assignments at the top.
  • Amounts: Using comm / disb (non-nominal). Swap to *_nominal if you want current USD.
  • Geography: gid_0 is treated as ISO3 for recipient. For choropleths I join to Natural Earth by iso_a3; a couple of codes (e.g., SSD) may need tiny fixes depending on your build.
  • Performance: If the CSV is too large, consider reading via Arrow (arrow::open_dataset) or save a feather/parquet for subsequent runs.